Remark. This homework aims to help you further understand model selection techniques in linear models. The Theoretical Part and Computational Part are worth 100 pt in total. For the Computational Part, please complete your answers in the RMarkdown file and submit the printed PDF homework created from it.
1.(10 pt) Consider the Nba data posted on the Sakai class site. Create a new data frame that contains the following columns:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ tibble 2.0.0 ✔ purrr 0.2.5
## ✔ tidyr 0.8.2 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.3.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ purrr::accumulate() masks foreach::accumulate()
## ✖ tidyr::expand() masks Matrix::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ purrr::when() masks foreach::when()
# Load the NBA team data and keep only the variables of interest.
nba <- read.csv("nba-teams-2017.csv")
nba1 <- nba %>%
  select(team, wins, points, points3, free_throws, off_rebounds,
         def_rebounds, assists, steals, personal_fouls)

# Reshape to long format (one row per team/metric pair) so every metric can
# share a single boxplot panel. pivot_longer() supersedes gather()
# (requires tidyr >= 1.0.0); the resulting plot is identical.
example_bball_long <- nba1 %>%
  pivot_longer(cols = -team, names_to = "metric", values_to = "value")

# Side-by-side boxplots of the raw (unscaled) features.
ggplot(example_bball_long, aes(x = metric, y = value)) +
  geom_boxplot(fill = "#7BAFD4", colour = "#13294B", notch = TRUE) +
  labs(title = "Box plots of the selected features",
       x = "",
       y = "") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
The data needs to be scaled because of the varying distributions for the variables. This can be seen on the boxplot where some plots can hardly be seen because of the scale of the data.
# Keep the team names aside, then run PCA on the numeric columns only.
teams <- nba1$team
# prcomp's argument is `scale.` — the original `scale = T` only worked via
# partial argument matching and the reassignable `T` alias.
# Standardizing is required here because the features are on very
# different scales (see the boxplots above).
standardized_nba <- prcomp(nba1 %>% select(-team), scale. = TRUE)
# Loadings (rotation matrix) for the first four principal components.
standardized_nba$rotation[, 1:4]
## PC1 PC2 PC3 PC4
## wins -0.42366308 0.07555818 -0.11681772 0.21657745
## points -0.50246947 -0.21708761 0.19677723 -0.07946391
## points3 -0.41664778 0.17268215 -0.08316881 -0.51204012
## free_throws -0.24452950 -0.41519726 0.30946852 -0.33272209
## off_rebounds 0.08111297 -0.39160091 0.47926467 0.46463787
## def_rebounds -0.26053718 0.26461371 0.57662166 0.15459073
## assists -0.45236958 0.05322237 -0.26482231 0.27220000
## steals -0.20525546 -0.41895791 -0.45168498 0.37698249
## personal_fouls 0.11583180 -0.58585494 -0.09277492 -0.34335891
# Variance of each PC is the squared singular value stored in sdev;
# normalize to get the proportion of total variance explained (PVE).
nba_variance <- standardized_nba$sdev^2
prop_var_expl <- nba_variance / sum(nba_variance)
# Scree plot of per-PC PVE.
plot(prop_var_expl, xlab = "PC", ylab = "Proportion Variance Explained",
     ylim = c(0, 1))
# Cumulative PVE (y-label corrected: this plots the cumulative sum).
plot(cumsum(prop_var_expl), xlab = "PC",
     ylab = "Cumulative Proportion Variance Explained", ylim = c(0, 1))
(e) If you were to retain all PCs which explain at least 90% of the variance, how many PCs would you retain?
# Cumulative proportion of variance explained, with a reference line
# marking the 90% threshold.
cumulative_pve <- cumsum(prop_var_expl)
plot(cumulative_pve, xlab = "PC", ylab = "Proportion Variance Explained",
     ylim = c(0, 1))
abline(h = 0.9, col = "red")
As can be seen in the plot above, 6 PCs need to be retained in order to explain more than 90% of the variance.
# PC scores for each team (rows of x), combined with the team names so
# the biplot below can label each point.
scores <- standardized_nba$x
plot_data <- scores %>%
  data.frame() %>%
  mutate(team = nba1$team) %>%
  select(team, everything())
## Warning: The `printer` argument is deprecated as of rlang 0.3.0.
## This warning is displayed once per session.
# Scatter plot of the first two PC scores, labeling each point with its
# team name; axis lines through the origin aid interpretation.
ggplot(plot_data, aes(PC1, PC2)) +
  geom_vline(xintercept = 0) +
  geom_hline(yintercept = 0) +
  geom_text(aes(label = team), size = 2) +
  theme_light() +
  scale_x_continuous(breaks = -10:10) +
  coord_cartesian(xlim = c(-8, 4))
Use `?readPNG` to remind yourself of the organization of the raster array output of that function. Answer the following questions:
# Read the PNG into an array (rows x cols x channels, values in [0, 1])
# and display it.
pic <- "Redfin_house.png"
picRGB <- readPNG(pic)
picPlot <- as.raster(picRGB)
grid.raster(picPlot)

# PCA on the first (red) channel only, treating each pixel column as a
# variable. `scale.` is prcomp's real argument name — the original
# `scale =T` relied on partial matching and the `T` alias.
X <- picRGB[, , 1]
pic.pr <- prcomp(X, scale. = TRUE)
# Rotation is (number of columns) x (number of PCs).
dim(pic.pr$rotation)
## [1] 798 505
# Proportion of variance explained by each PC of the image.
pic_variance <- pic.pr$sdev^2
prop_var_expl <- pic_variance / sum(pic_variance)
# NOTE(review): par() is not restored afterwards; fine here since no base
# plots follow, but consider on.exit(par(op)) in reusable code.
par(mfrow = c(1, 2))
plot(prop_var_expl, xlab = "PC", ylab = "Proportion Variance Explained",
     ylim = c(0, 1))
# Zoom in (xlim) on the region where the cumulative proportion crosses 90%.
plot(cumsum(prop_var_expl), xlab = "PC",
     ylab = "Proportion Variance Explained", ylim = c(0, 1),
     xlim = c(20, 30))
abline(h = 0.9, col = "red")
As can be seen from the plot, 24 PCs are needed to account for at least 90% of the variance.
# Reconstruct the image from only the first k PCs, for several values of k.
loading <- pic.pr$rotation
num_pc <- c(1, 5, 10, 15, 20, 30, 50, 100, 200)
pc.image <- vector("list", length(num_pc))  # preallocate result list
# Standardized pixel matrix (same preprocessing prcomp applied).
# Renamed from `image` to avoid shadowing base::image().
X_scaled <- scale(X)
# Total number of PCs; was hard-coded as 505 (image-specific).
n_comp <- ncol(loading)
for (i in seq_along(num_pc)) {
  # Zero out all loadings beyond the first num_pc[i] components.
  u.proj <- loading
  u.proj[, (num_pc[i] + 1):n_comp] <- 0
  # Project onto the retained PCs and map back to pixel space.
  projection <- (X_scaled %*% u.proj) %*% t(u.proj)
  # Rescale to [0, 1] so the result is a valid grayscale raster.
  rescaled <- projection - min(as.numeric(projection))
  rescaled <- rescaled / max(as.numeric(rescaled))
  pc.image[[i]] <- as.raster(rescaled)
}
# Display each reconstruction in turn. (In the knitted document each call
# was its own figure; on a single device later calls draw over earlier ones.)
grid.raster(pc.image[[1]], name = "PC 1")
for (k in 2:length(pc.image)) {
  grid.raster(pc.image[[k]])
}
College data set from the ISLR package.
# YOUR CODE HERE (uncomment first of course)
# YOUR CODE HERE (uncomment first of course)
# YOUR CODE HERE (uncomment first of course)
# YOUR CODE HERE (uncomment first of course)
# YOUR CODE HERE (uncomment first of course)
# YOUR CODE HERE (uncomment first of course)